library(ggplot2)

## Project root containing the ./data and ./figs directories used below.
## Leave empty to run from the current working directory.
in_out_path <- ""

## setwd("") raises "cannot change working directory", so only switch
## directories when a path has actually been supplied.
if (nzchar(in_out_path)) {
    setwd(in_out_path)
} else {
    message("in_out_path is empty; using current working directory: ", getwd())
}

## note github.com/wilryh/parrot has updated code for this method
## The objects used below (pcax, thecc, out.sub.vocab, out.sub.vocab.n) are
## presumably restored by this load() call -- TODO confirm against the .RData.
load("./data/muslim_mentions_in_transcripts_coefs.RData")

## rename the "PC*" columns of pcax to "X*"
colnames(pcax) <- gsub("PC", "X", colnames(pcax), fixed = TRUE)

## One row per word: the word, its count, the pcax columns, then the first five
## canonical x-scores and y-scores.
## NOTE(review): later code refers to the score columns as X<k>.1 (xscores) and
## X<k>.2 (yscores). Those names come from data.frame()'s automatic
## de-duplication and assume the score matrices carry no column names and that
## pcax already owns X1..X5 -- confirm.
cc.scores <- thecc$scores
words <- data.frame(
    out.sub.vocab,
    out.sub.vocab.n,
    pcax,
    cc.scores$xscores[, 1:5],
    cc.scores$yscores[, 1:5]
)
names(words)[1:2] <- c("word", "n")

## Assign every word to a k-means cluster computed from the first 10 columns
## of the canonical x-scores, then attach human-readable cluster labels.
##
## NOTE(review): the assignment on the next line is overwritten immediately by
## the one after it, so only the unweighted clustering is kept. The call still
## runs (and presumably draws from the RNG, which can influence the second
## kmeans() result). `pcax[1:10]` uses single-index subsetting; if pcax is a
## matrix that is its first 10 elements in column-major order -- confirm that is
## the intended weighting before reviving this variant.
words$cluster.x <- kmeans(sweep(thecc$scores$xscores[,1:10], 2,  pcax[1:10], `*`), 10)$cluster
## Clustering actually used: 10 centers, unweighted x-scores.
## NOTE(review): no set.seed() call is visible here, so cluster numbering may
## differ from run to run.
words$cluster.x <- kmeans(thecc$scores$xscores[,1:10], 10)$cluster
## there is a catch-all cluster removed from some figures
## (max.cluster is the position of the most populous cluster in the table)
max.cluster <- which.max(table(words$cluster.x))

words$cluster.x <- factor(words$cluster.x)
## must double-check this since clusters will change when changing any parameters
## NOTE(review): kmeans() above requests 10 clusters but only 7 labels follow;
## `levels<-` stops with "number of levels differs" when given fewer labels than
## the factor has levels -- confirm the intended cluster count / label set.
levels(words$cluster.x) <- c(
    "Donald Trump/immigration","Other countries","DNC/Khans",
    "Attacks","Other identity groups","Ted Cruz/surveillance","Uncategorized"
)

## total pivot score: Euclidean norm of each row of the canonical y-scores
themeans <- sqrt(rowSums(thecc$scores$yscores^2))

## rescale pivot words to same as ordinary word scores:
## multiply X<k>.2 by the k-th canonical correlation for k = 1..4
## NOTE(review): X5.2 is used further down but is not rescaled here -- confirm
## whether dimension 5 was left out on purpose.
pivot.cols <- paste0("X", 1:4, ".2")
words[pivot.cols] <- Map(`*`, words[pivot.cols], thecc$cor[1:4])

## this prints for latex
## Top-15 word lists for dimensions 1-5, consumed by the table code below.
##   dK.n / dK.p             : lowest / highest words on column XK.2, weighted by
##                             themeans / max(themeans)
##   dK.n.story / dK.p.story : lowest / highest words on column XK.1, weighted by
##                             sqrt(themeans)
## ".p" lists are returned highest-first.
##
## Fix: d2.p.story previously ranked on X2.2 (a copy-paste slip); every other
## ".story" list ranks on the matching ".1" column, so it now uses X2.1.

## Words with the n smallest values of score * weight, in ascending order.
lowest_words <- function(score, weight, n = 15) {
    head(as.character(words$word)[order(score * weight)], n = n)
}

## Words with the n largest values of score * weight, largest first.
highest_words <- function(score, weight, n = 15) {
    rev(tail(as.character(words$word)[order(score * weight)], n = n))
}

pivot.weight <- themeans / max(themeans)
story.weight <- sqrt(themeans)

d1.n <- lowest_words(words$X1.2, pivot.weight)
d1.n.story <- lowest_words(words$X1.1, story.weight)
d1.p <- highest_words(words$X1.2, pivot.weight)
d1.p.story <- highest_words(words$X1.1, story.weight)

d2.n <- lowest_words(words$X2.2, pivot.weight)
d2.n.story <- lowest_words(words$X2.1, story.weight)
d2.p <- highest_words(words$X2.2, pivot.weight)
d2.p.story <- highest_words(words$X2.1, story.weight)

d3.n <- lowest_words(words$X3.2, pivot.weight)
d3.n.story <- lowest_words(words$X3.1, story.weight)
d3.p <- highest_words(words$X3.2, pivot.weight)
d3.p.story <- highest_words(words$X3.1, story.weight)

d4.n <- lowest_words(words$X4.2, pivot.weight)
d4.n.story <- lowest_words(words$X4.1, story.weight)
d4.p <- highest_words(words$X4.2, pivot.weight)
d4.p.story <- highest_words(words$X4.1, story.weight)

d5.n <- lowest_words(words$X5.2, pivot.weight)
d5.n.story <- lowest_words(words$X5.1, story.weight)
d5.p <- highest_words(words$X5.2, pivot.weight)
d5.p.story <- highest_words(words$X5.1, story.weight)

## Print the given equal-length word lists as LaTeX table rows: the i-th entries
## of all lists are joined with " & " and rows are terminated with "\\".
print_latex_rows <- function(...) {
    row.cells <- do.call(paste, c(list(...), sep = " & "))
    cat("\n\n\n", paste(row.cells, collapse = "  \\\\ \n"), "\n")
}

## table 1
print_latex_rows(
    d1.p.story, d1.n.story,
    d2.p.story, d2.n.story,
    d5.p.story
)

## table A1
print_latex_rows(
    d1.p.story, d1.n.story,
    d2.p.story, d2.n.story,
    d3.p.story, d3.n.story,
    d4.p.story, d4.n.story,
    d5.p.story, d5.n.story
)

## table A2
print_latex_rows(
    d1.p, d1.n,
    d2.p, d2.n,
    d3.p, d3.n,
    d4.p, d4.n,
    d5.p, d5.n
)


## figures A1 and A2
## Two pages in one PDF, both restricted to words occurring more than 50 times.
## Text is sized by count and colored by cluster; only the plotted coordinate
## columns differ between the pages.

pdf("./figs/muslim_transcript_text_colored_clusters.pdf", width = 10)

frequent.words <- subset(words, n > 50)

## axis labels and legend titles shared by both pages
shared.layers <- list(
    xlab("Dim 1"),
    ylab("Dim 2"),
    guides(
        color = guide_legend(title = "Color: text cluster"),
        size = guide_legend(title = "Size: N occurrences")
    )
)

## page 1: X1.1 against X2.1
g <- ggplot(frequent.words) +
    geom_text(aes(x = X1.1, y = X2.1, label = word, size = n, color = factor(cluster.x))) +
    shared.layers
print(g)

## page 2: X1.2 against X2.2
g <- ggplot(frequent.words) +
    geom_text(aes(x = X1.2, y = X2.2, label = word, size = n, color = factor(cluster.x))) +
    shared.layers
print(g)

dev.off()
